Negativicutes example
= Mount shared folder, VM = sudo mkdir /media/BiotoolsShared-virtual sudo mount -t vboxsf myShare /media/BiotoolsShared-virtual *remove zero length entries from FASTA foreach i (*.fsa ) echo $i saco_convert -I Fasta -O tab $i > $i:r.tab sed -r '/\t\t\t/d' $i:r.tab > $i.temp.tab saco_convert -I tab -O Fasta $i.temp.tab > $i rm *tab end = Download genomes = *Download genomes from ACC or WGS numbers for x in CP001859 CP003058 FP929048 HE576794 CP002637 CP001820 ACGB00000000 AFHQ00000000 ACIM00000000 AFBB00000000 AENT00000000 ADGP00000000 AECS00000000 AFUG00000000 AFIJ00000000 ABWK00000000 AEVN00000000 AECV00000000 ACLA00000000 ACKT00000000 AENV00000000 AEEJ00000000 ACKP00000000 AAWL00000000 AEDR00000000 AEDS00000000 ACIK00000000 ADFU00000000 ADCV00000000 ADCW00000000 AENU00000000 AFUJ00000000 do getgbk -a $x > $x.gbk done *Rename files after organism name in GenBank file and ACC or WGS number *Procedure copies files for x in *gbk do extractname $x done *Remove files named only as ACC or WGS numbers. *Procedure leaves files with organism names for x in CP001859 CP003058 FP929048 HE576794 CP002637 CP001820 ACGB00000000 AFHQ00000000 ACIM00000000 AFBB00000000 AENT00000000 ADGP00000000 AECS00000000 AFUG00000000 AFIJ00000000 ABWK00000000 AEVN00000000 AECV00000000 ACLA00000000 ACKT00000000 AENV00000000 AEEJ00000000 ACKP00000000 AAWL00000000 AEDR00000000 AEDS00000000 ACIK00000000 ADFU00000000 ADCV00000000 ADCW00000000 AENU00000000 AFUJ00000000 do rm $x.gbk done = Extract DNA from GenBank = *Extract DNA from Origin field of GenBank file for x in *gbk do saco_convert -I genbank -O fasta $x > $x.fna done *Rename files from *gbk.fna to *.fna for x in *gbk.fna do newx=`echo $x | sed "s/.gbk.fna/.fna/"` mv $x $newx done = 16S rRNA analysis = *Search DNA string for patterns similar to rRNA models in RNAmmer for x in *fna do echo $x rnammer -S bac -m ssu -f $x.rrna $x done *Count total number of sequences in a set of rrna files: grep -c ">" *rrna | awk 'BEGIN{FS=":"}{sum += $2}END { print sum}' *Rename files from *fna.rrna to *.rrna for x in *fna.rrna do newx=`echo $x | sed "s/.fna.rrna/.rrna/"` mv $x $newx done *Extract one 16S rRNA sequence from each genome *Goes trough each file in the directory called *rrna select16SrRNA -out all.fna Selecting the sequence with highest score and length between 1400 and 2000 Default is 1400 - 2000 base pairs, change using -min and -max Program is writing to file called all.fna Default output file is selected16SrRNA.fna, change using -out Number of files to be evaluated (extension *.rrna): 32 Number of files with no sequences: 2 Score: 1920.1 :: Length 1545 :: Acidaminococcus_sp_D21_ID_ACGB00000000.rrna Score: 1836.1 :: Length 1557 :: Dialister_invisus_DSM_15470_ID_ACIM00000000.rrna Score: 1878.8 :: Length 1555 :: Dialister_micraerophilus_DSM_19965_ID_AFBB00000000.rrna Unacceptable length: 1325 :: /molecule=16s_rRNA /score=1197.2 :: Dialister_microaerophilus_UPII_345-E_ID_AENT00000000.rrna ............ *Using default setting, 6 sequence do not have the right length (1400-1800) select16SrRNA -out all.fna | grep Unacc | wc = 6 *Using a smaller minimum length, all 29 genomes have an acceptable 16S rRNA sequence select16SrRNA -out all.fna -min 1100 | grep -c Unacc = 0 *Performing a multiple alignment of the selected sequences: clustalw all.fna clustalw all.fna -bootstrap=1000 njplot all.phb *Save file as post script Postscript file was made pretty manually in texteditor = Genome atlas = *Atlases were created for complete genomes= *For each atlas a folder was created to contain the files for that atlas *The files are saved but can be deleted as soon as the atlas has been drawn mkdir GenomeAtlas Vparvula sed s/XXXX/Veillonella_parvula_DSM_2008_ID_CP001820/g /usr/biotools/genomeAtlas > Vparvula.genomeatlas.sh chmod +x Vparvula.genomeatlas.sh ./Vparvula.genomeatlas.sh = Identify open readingframes in DNA using Prodigal = for x in *fna do prodigalrunner $x done = Bias in third codon position and Amino acid and codon usage = *Calculate AT and amino acid usage in open reading frame files, FASTA format for i in *orf.fna do perl /usr/biotools/indirect/atStats.pl $i > $i.atStats.tab cat $i.atStats.tab > $i.CodonAaUsage perl /usr/biotools/indirect/CodonAaUsage.pl $i >> $i.CodonAaUsage rm $i.atStats.tab done grep aa *AaUsage > aaUsage.all sed -i s/_prodigal.orf.fna.CodonAaUsage:aa//g aaUsage.all grep Total *AaUsage > statistics.all sed -i 's/_prodigal.orf.fna.CodonAaUsage:/\t/g' statistics.all cut -f2,3,4,5,6,7,8 statistics.all > tmp.all mv tmp.all statistics.all sed -i 's/_prodigal.orf.fna//g' statistics.all grep codon *AaUsage > codonUsage.all sed -i s/_prodigal.orf.fna.CodonAaUsage:codon//g codonUsage.all = Drawing heatmaps of Bias in third codon position and Amino acid and codon usage = install.packages("gplots") library(gplots) codon <- read.table("codonUsage.all") colnames(codon) <- c( 'Name', 'codon', 'score', 'count') codon <- codon1:3 test <- reshape(codon, idvar="Name", timevar="codon", direction="wide") codonMatrix <- data.matrix(test2:length(test)) rownames(codonMatrix) <- test$Name codon_heatmap <- heatmap.2(codonMatrix, scale="none", main="Codon usage", xlab="Codon fraction", ylab="Organism", trace="none", margins=c(8, 25)) dev.print(postscript, "codonUsage.ps", width = 25, height=25) dev.off() library(gplots) aa <- read.table("aaUsage.all") colnames(aa) <- c( 'Name', 'aa', 'score') test <- reshape(aa, idvar="Name", timevar="aa", direction="wide") aaMatrix <- data.matrix(test2:length(test)) rownames(aaMatrix) <- test$Name stat_heatmap <- heatmap.2(aaMatrix, scale="none", main="Amino acid usage", xlab="Amino acid fraction", ylab="Organism", trace="none", margins=c(8, 25), col = cm.colors(256)) dev.print(postscript, "aaUsage.ps", width = 25, height=25) dev.off() = Bar and rose plots for amino acid, codon usage and bias in third position = for x in *orf.fna do basicgenomeanalysis $x /usr/bin/gnuplot done = BLAST martix = *Input file is created in directory with protein files in FASTA format *Takes all fsa files in directory makebmdest . > bmdest.xml blastmatrix -cpu 5 bmdest.xml > blastmatrix.ps = Core and pan genome analysis, single linkage clustering = *Input file is created in directory with protein files in FASTA format *The output is a post script file and a table written to the screen = ls -1 *orf.fsa | gawk '{print $1 "\t" $1}' > pancore.list pancoreplot pancore.list = Genome statistics = for x in *orf.fna do genomeStatistics $x >> all.stats done fig5_codonUsage.jpg|Codon usage heatmap Fig4 bias.jpg|Bias in codon positions Fig5 aaUsage.jpg|Amino acid usage heatmap fig6_Matrix.jpg|BLAST matrix proteome comparison fig7_Pancore.jpg|Pan- and core-genome calculation using BLAST, proteome comparison Veillonella parvula DSM 2008.jpg|Genome structure atlas Fig2 rRNAtree.jpg|16S rRNA tree using RNAmmer and ClustalW